Automotive crashes remain one of the leading causes of death in the United States. After many years of declining traffic fatalities, 2015 saw an uptick in accidents. Many factors contributed to this higher number of accidents.
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
#Plotly is an online analytics and data visualization tool
import chart_studio.plotly as py
import plotly.graph_objects as go
from plotly import tools
from plotly.offline import iplot, init_notebook_mode
import plotly
plotly.offline.init_notebook_mode()
The dataset of 2015 US traffic accidents (accident.csv) is located in the ./Data folder (the loading code below reads it via the relative path ../EDA/Data/accident.csv).
# Load the accident table, keeping only the columns (selected by position) used below.
wanted_columns = [0, 1, 8, 11, 12, 13, 25, 26, 28, 38, 50, 51]
accident_data = pd.read_csv('../EDA/Data/accident.csv', usecols=wanted_columns)

# Replace the terse source headers with readable names.
readable_names = {
    'ST_CASE': 'case_id',
    'LONGITUD': 'longitude',
    'HARM_EV': 'harmful_event',
    'DRUNK_DR': 'drunk_drivers',
}
accident_data = accident_data.rename(columns=readable_names)

# Normalise every header to "Capitalized" form (first letter upper, rest lower).
accident_data.columns = accident_data.columns.str.capitalize()

# Assemble a single 'Date' column from the separate day/month/year columns,
# which are no longer needed once the date exists.
date_parts = ['Day', 'Month', 'Year']
accident_data['Date'] = pd.to_datetime(accident_data[date_parts])
accident_data = accident_data.drop(columns=date_parts)

# Fix the column order and sort the rows chronologically.
column_order = ['Case_id', 'Date', 'State', 'Latitude', 'Longitude', 'Weather',
                'Harmful_event', 'Persons', 'Fatals', 'Drunk_drivers']
accident_data = accident_data[column_order].sort_values('Date')

# Preview the first 5 rows (rendered only when this is the last expression of a notebook cell).
accident_data.head()
# Hover label for every accident: "<YYYY-MM-DD>, <persons> involved".
date_labels = accident_data['Date'].dt.strftime('%Y-%m-%d')
person_labels = accident_data['Persons'].astype(str)
accident_data['Text'] = date_labels + ', ' + person_labels + ' involved'

# Only rows with a negative longitude are drawn; select them once and reuse the result.
# NOTE(review): presumably this drops placeholder/unknown coordinates - confirm.
plotted = accident_data[accident_data['Longitude'] < 0]

# Marker appearance.
# NOTE(review): the size is driven by 'Drunk_drivers', so rows with 0 drunk drivers
# get a marker of size 0, while the title refers to persons involved - confirm intent.
marker_style = dict(
    size=plotted['Drunk_drivers'] * 5,
    opacity=0.8,
    color='rgb(255, 0, 250)',
)

# One scattergeo trace holding every plotted accident.
accident_trace = dict(
    type='scattergeo',
    locationmode='USA-states',
    lon=plotted['Longitude'],
    lat=plotted['Latitude'],
    text=plotted['Text'],
    mode='markers',
    marker=marker_style,
)
data = [accident_trace]

# Map styling: USA scope with the Albers USA projection.
geo_style = dict(
    scope='usa',
    projection=dict(type='albers usa'),
    showland=True,
    landcolor='rgb(250, 250, 250)',
    subunitwidth=1,
    subunitcolor='rgb(217, 217, 217)',
    countrywidth=1,
    countrycolor='rgb(217, 217, 217)',
    showlakes=True,
    lakecolor='rgb(255, 255, 255)',
)
layout = dict(
    title='Number of Persons Involved in Traffic Accidents in USA in 2015<br>'
          '<sub>Hover to view the details</sub>',
    geo=geo_style,
)

# Render the figure inline.
figure = dict(data=data, layout=layout)
iplot(figure)
# US state abbreviations (50 states + DC), listed in alphabetical order of the full state names
us_states = np.asarray(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
                        'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
                        'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
                        'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
                        'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'])
# total fatals per state; groupby returns the groups sorted by ascending 'State' code
fatals_perstate = accident_data.groupby('State')['Fatals'].sum().values
# The totals are paired with us_states purely by position.
# NOTE(review): this assumes the numeric 'State' codes sort in the same order as
# us_states (as FIPS state codes do) - confirm against the dataset's code book.
# A different number of groups would silently attach totals to the wrong states,
# so fail loudly instead of drawing a misleading map.
if len(fatals_perstate) != len(us_states):
    raise ValueError(
        'Expected {} state groups in the data but found {}; state totals cannot be '
        'matched to us_states by position'.format(len(us_states), len(fatals_perstate)))
# color scale: green for the lowest total, red for the highest
color_scale = [[0, 'rgb(0, 255, 0)'], [1, 'rgb(255, 0, 0)']]
# a choropleth map shades each state according to its value, similar to a heat map
data = [dict(
    type='choropleth',
    autocolorscale=False,
    colorscale=color_scale,
    showscale=False,
    locations=us_states,
    locationmode='USA-states',
    z=fatals_perstate,
    marker=dict(
        line=dict(
            color='rgb(200, 200, 200)',
            width=3)
    )
)]
# the layout of the figure
layout = dict(
    title='Number of Fatals Involved in Traffic Accidents by State in USA in 2015<br>'
          '<sub>Hover to view the details</sub>',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        countrycolor='rgb(255, 255, 255)',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)')
)
# show the figure
figure = dict(data=data, layout=layout)
iplot(figure)
# numpy array of every day in the period [2015-01-01, 2015-12-31]
accident_dates = np.arange('2015-01', '2016-01', dtype='datetime64[D]')
# number of traffic accidents by date, aligned to the full 2015 calendar:
# groupby omits days without any row, so reindex onto every day and count those as 0
# (otherwise the counts would no longer line up with accident_dates on the x-axis)
accident_perdate = (
    accident_data.groupby('Date')['Case_id'].count()
    .reindex(pd.date_range('2015-01-01', '2015-12-31', freq='D'), fill_value=0)
    .values
)
# 20-day moving average of the daily accident counts
moving_window = 20
accident_average = pd.Series(accident_perdate).rolling(window=moving_window).mean()
# the first (window - 1) entries are NaN because their window is incomplete;
# drop them, then convert to a rounded numpy array
accident_average = accident_average.iloc[moving_window - 1:].values.round()
# plot every average half a window after the start of its window, which gives the
# dates [2015-01-11, 2015-12-22] for a 20-day window
half_window = moving_window // 2
accident_range = accident_dates[half_window:half_window + len(accident_average)]
fig = go.Figure()
# go == plotly.graph_objects
# scatter graph object #1: raw daily counts
# (each counted row is one accident case, hence the trace name 'Accidents')
fig.add_trace(
    go.Scatter(
        x=accident_dates,
        y=accident_perdate,
        mode='lines',
        name='Accidents',
        line=dict(
            color='rgb(215, 0, 0)',
            width=3)
    ))
# scatter graph object #2: moving average
fig.add_trace(
    go.Scatter(
        x=accident_range,
        y=accident_average,
        mode='lines',
        name='Average',
        line=dict(
            color='rgb(0, 0, 255)',
            width=5),
        opacity=0.33
    ))
fig.show()
# weather code -> description
# Code 3 previously repeated the label of code 12 (differing only in letter case),
# which produced two indistinguishable bars; the FARS coding manual lists 3 as
# "Sleet or Hail" and 12 as "Freezing Rain or Drizzle".
map_weather = {0: 'No additional atmospheric condition', 1: 'Clear', 2: 'Rain', 3: 'Sleet or Hail',
               4: 'Snow', 5: 'Fog, smog, smoke', 6: 'Severe crosswinds', 7: 'Blowing sand, soil, dirt',
               8: 'Other', 10: 'Cloudy', 11: 'Blowing snow', 12: 'Freezing Rain or Drizzle',
               98: 'Not reported', 99: 'Unknown'}
# accident counts grouped by weather condition, most frequent first;
# codes of 15 and above (which includes 98 'Not reported' and 99 'Unknown') are left out
known_weather = accident_data[accident_data['Weather'] < 15]
accident_weather_counts = known_weather.groupby('Weather')['Case_id'].count().sort_values(ascending=False)
# x-axis displays the accident counts by weather condition
x_values = accident_weather_counts.values
# the percentage of accidents for each weather condition, formatted as e.g. '12.34%'
weather_percent = np.round(x_values / x_values.sum() * 100, 2).astype(str)
weather_percent = np.array([s + '%' for s in weather_percent])
# y-axis displays the weather conditions as text
y_values = pd.Series(accident_weather_counts.index).map(map_weather).values
# Bar graph-object (horizontal bars; the hover box shows the label and the percentage)
data = [go.Bar(
    x=x_values,
    y=y_values,
    text=weather_percent,
    orientation='h',
    hoverinfo='y+text',
    marker=dict(
        color='rgb(200, 0, 200)')
)]
# the layout; the x tick labels are hidden because each bar is annotated with its count
layout = go.Layout(
    title='Number of Traffic Accidents by Weather Condition in USA in 2015<br>'
          '<sub>Hover to view the details</sub>',
    xaxis=dict(
        showgrid=False,
        showticklabels=False
    ),
    autosize=False,
    margin=dict(
        autoexpand=False,
        l=200, r=40, pad=5
    ),
    # one annotation per bar: the raw count, placed at the end of the bar
    annotations=[
        dict(x=x, y=y,
             text=str(x),
             xanchor='left',
             yanchor='middle',
             showarrow=False) for x, y in zip(x_values, y_values)]
)
figure = dict(data=data, layout=layout)
iplot(figure)
# accident counts grouped by harmful event, most frequent first
accident_harmful_events = accident_data.groupby('Harmful_event')['Case_id'].count().sort_values(ascending=False)
# total accident counts
total_accidents = accident_harmful_events.sum()
# accident counts of the ten most frequent harmful events (fewer if the data has fewer)
accident_harmful_events_top10 = accident_harmful_events.head(10)
# accident percentage of the top-10 harmful events
accident_harmful_events_top10_percentage = np.round(accident_harmful_events_top10 / total_accidents * 100, 2)
# mapping between harmful event code and text
map_harmful_events = {12: 'Motor Vehicle in Transport', 8: 'Pedestrian', 1: 'Rollover/Overturn', 42: 'Tree',
                      33: 'Curb', 34: 'Ditch', 35: 'Embankment', 9: 'Pedalcyclist', 24: 'Guardrail Face',
                      30: 'Utility Pole/Light Support'}
# event codes of the top-10 harmful events, in descending order of accident count
top10_codes = accident_harmful_events_top10.index
# number of persons involved for each top-10 harmful event
persons_harmful_events_top10 = accident_data.groupby('Harmful_event')['Persons'].sum().loc[top10_codes]
# number of fatals for each top-10 harmful event
fatals_harmful_events_top10 = accident_data.groupby('Harmful_event')['Fatals'].sum().loc[top10_codes]
# the text for the top-10 harmful events; a code missing from the mapping gets a
# generic label instead of NaN, which would break the string building below
harmful_event_categories = np.array(
    [map_harmful_events.get(code, 'Harmful event code {}'.format(code)) for code in top10_codes],
    dtype=object)
# area of the circle: accident count (and percentage) by harmful event
harmful_event_count = accident_harmful_events_top10.values
harmful_event_percent = accident_harmful_events_top10_percentage.values
# y-axis: the number of fatals by harmful event
harmful_event_fatals = fatals_harmful_events_top10.values
# log10 of the y values (plotly positions annotations on log axes by the log10 value)
harmful_event_yaxis = np.log10(harmful_event_fatals)
# x-axis: the number of persons involved by harmful event
harmful_event_persons = persons_harmful_events_top10.values
# log10 of the x values
harmful_event_xaxis = np.log10(harmful_event_persons)
# the hover text: "<event> (<percent>%)<br><fatals> Killed, <persons> Involved"
harmful_event_text = [
    '{} ({}%)<br>{} Killed, {} Involved'.format(category, percent, fatals, persons)
    for category, percent, fatals, persons in zip(
        harmful_event_categories, harmful_event_percent,
        harmful_event_fatals, harmful_event_persons)
]
data = [go.Scatter(
    x=harmful_event_persons,
    y=harmful_event_fatals,
    text=harmful_event_text,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        size=(harmful_event_count) / 100,
        opacity=0.9,
        color='rgb(240, 140, 45)')
)]
# layout: log scale for both x-axis and y-axis
layout = go.Layout(
    title='Number of Injured Persons by Top-10 Harmful Events in USA in 2015<br>'
          '<sub>Hover to view the details</sub>',
    xaxis=dict(
        title='Persons Involved',
        type='log',
        #range = [0.45, 3.51],
        tickmode='auto',
        nticks=4,
        showline=True,
        showgrid=False
    ),
    yaxis=dict(
        title='Fatals',
        type='log',
        #range = [0.65, 3.33],
        tickmode='auto',
        nticks=3,
        showline=True,
        showgrid=False)
)
# annotation is the text below the solid circle; one per plotted event, so this also
# works when fewer than ten events exist. xanchor must be 'center' ('middle' is only
# valid for yanchor and is rejected by plotly's layout validation).
annotations = [
    dict(x=x_log, y=y_log,
         xanchor='center', yanchor='top',
         text=category,
         showarrow=False)
    for x_log, y_log, category in zip(harmful_event_xaxis, harmful_event_yaxis, harmful_event_categories)
]
# the annotations are built but not attached to the layout; uncomment to display them
#layout['annotations'] = annotations
figure = dict(data=data, layout=layout)
iplot(figure)
# numpy array of every day in the period [2015-01-01, 2015-12-31] (the x-axis)
accident_dates = np.arange('2015-01', '2016-01', dtype='datetime64[D]')
# the same calendar as a pandas index; every per-date count below is reindexed onto it,
# because groupby omits days without rows and the three counts would otherwise differ
# in length and could not be divided element-wise
calendar_2015 = pd.date_range('2015-01-01', '2015-12-31', freq='D')
# accidents per date
total_per_day = accident_data.groupby('Date')['Case_id'].count().reindex(calendar_2015, fill_value=0)
accident_total_perdate = total_per_day.values
# accidents with at least one drunk driver
accident_drunk_drivers = accident_data[accident_data['Drunk_drivers'] > 0]
# accidents with at least one drunk driver per date
drunk_per_day = accident_drunk_drivers.groupby('Date')['Case_id'].count().reindex(calendar_2015, fill_value=0)
accident_drunk_drivers_perdate = drunk_per_day.values
# percentage of the accidents with drunk drivers per date
# (NaN for a day without any accident, which plotly draws as a gap)
accident_drunk_drivers_perdate_percentage = np.round(drunk_per_day / total_per_day * 100, 1).values
# accidents without any drunk driver
accident_sober_drivers = accident_data[accident_data['Drunk_drivers'] == 0]
# accidents without any drunk driver per date
sober_per_day = accident_sober_drivers.groupby('Date')['Case_id'].count().reindex(calendar_2015, fill_value=0)
accident_sober_drivers_perdate = sober_per_day.values
# percentage of the accidents without drunk drivers per date
accident_sober_drivers_perdate_percentage = np.round(sober_per_day / total_per_day * 100, 1).values
# labels
labels = ['Drunk drivers', 'Sober drivers']
# colors
colors = ['rgb(0, 0, 200)', 'rgb(200, 0, 0)']
# x-axis
x_data = accident_dates
# y-axis: one row per label
y_data = np.asarray([accident_drunk_drivers_perdate_percentage, accident_sober_drivers_perdate_percentage])
# one line trace per driver category
traces = [
    go.Scatter(
        x=x_data,
        y=y_trace,
        mode='lines',
        name=label,
        line=dict(color=color, width=3)
    )
    for y_trace, label, color in zip(y_data, labels, colors)
]
layout = go.Layout(
    title='Accidents by Drunk or Sober Drivers per Date in USA in 2015<br>'
          '<sub>Hover to view the details</sub>',
    showlegend=True,
    xaxis=dict(
        showline=True,
        showgrid=True
    ),
    yaxis=dict(
        ticksuffix='%',
        showline=True,
        zeroline=False,
        showgrid=True,
        showticklabels=True,
        range=[0., 100]
    ),
    margin=dict(
        autoexpand=True,
        l=127, r=38)
)
# two annotations per trace: the first value at the left edge of the plot and the
# last value at the right edge
annotations = []
for y_trace, label in zip(y_data, labels):
    # left edge: label plus the value of the first day
    annotations.append(dict(xref='paper', x=0.0475, y=y_trace[0],
                            xanchor='right', yanchor='middle',
                            text=label + ' {}%'.format(y_trace[0]),
                            showarrow=False))
    # right edge: value of the last day (previously the arbitrary index 50 was used)
    annotations.append(dict(xref='paper', x=0.9525, y=y_trace[-1],
                            xanchor='left', yanchor='middle',
                            text='{}%'.format(y_trace[-1]),
                            showarrow=False))
# anchor the two right-edge labels differently (entries 1 and 3 of the list)
annotations[1].update(yanchor='top')
annotations[3].update(yanchor='bottom')
# the annotations are built but not attached to the layout; uncomment to display them
#layout['annotations'] = annotations
figure = dict(data=traces, layout=layout)
iplot(figure)